From: Jan Beulich <jbeulich@suse.com>
Subject: IOMMU: hold page ref until after deferred TLB flush

When moving around a page via XENMAPSPACE_gmfn_range, deferring the TLB
flush for the "from" GFN range requires that the page remains allocated
to the guest until the TLB flush has actually occurred. Otherwise a
parallel hypercall to remove the page would only flush the TLB for the
GFN it has been moved to, but not the one is was mapped at originally.

This is part of XSA-346.

Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ")
Reported-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Julien Grall <jgrall@amazon.com>

--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -1407,7 +1407,7 @@ void share_xen_page_with_guest(struct pa
 int xenmem_add_to_physmap_one(
     struct domain *d,
     unsigned int space,
-    union xen_add_to_physmap_batch_extra extra,
+    union add_to_physmap_extra extra,
     unsigned long idx,
     gfn_t gfn)
 {
@@ -1480,10 +1480,6 @@ int xenmem_add_to_physmap_one(
         break;
     }
     case XENMAPSPACE_dev_mmio:
-        /* extra should be 0. Reserved for future use. */
-        if ( extra.res0 )
-            return -EOPNOTSUPP;
-
         rc = map_dev_mmio_region(d, gfn, 1, _mfn(idx));
         return rc;
 
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4617,7 +4617,7 @@ static int handle_iomem_range(unsigned l
 int xenmem_add_to_physmap_one(
     struct domain *d,
     unsigned int space,
-    union xen_add_to_physmap_batch_extra extra,
+    union add_to_physmap_extra extra,
     unsigned long idx,
     gfn_t gpfn)
 {
@@ -4701,9 +4701,20 @@ int xenmem_add_to_physmap_one(
         rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K);
 
  put_both:
-    /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */
+    /*
+     * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top.
+     * We also may need to transfer ownership of the page reference to our
+     * caller.
+     */
     if ( space == XENMAPSPACE_gmfn )
+    {
         put_gfn(d, gfn);
+        if ( !rc && extra.ppage )
+        {
+            *extra.ppage = page;
+            page = NULL;
+        }
+    }
 
     if ( page )
         put_page(page);
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -814,13 +814,12 @@ int xenmem_add_to_physmap(struct domain
 {
     unsigned int done = 0;
     long rc = 0;
-    union xen_add_to_physmap_batch_extra extra;
+    union add_to_physmap_extra extra = {};
+    struct page_info *pages[16];
 
     ASSERT(paging_mode_translate(d));
 
-    if ( xatp->space != XENMAPSPACE_gmfn_foreign )
-        extra.res0 = 0;
-    else
+    if ( xatp->space == XENMAPSPACE_gmfn_foreign )
         extra.foreign_domid = DOMID_INVALID;
 
     if ( xatp->space != XENMAPSPACE_gmfn_range )
@@ -835,7 +834,10 @@ int xenmem_add_to_physmap(struct domain
     xatp->size -= start;
 
     if ( is_iommu_enabled(d) )
+    {
        this_cpu(iommu_dont_flush_iotlb) = 1;
+       extra.ppage = &pages[0];
+    }
 
     while ( xatp->size > done )
     {
@@ -847,8 +849,12 @@ int xenmem_add_to_physmap(struct domain
         xatp->idx++;
         xatp->gpfn++;
 
+        if ( extra.ppage )
+            ++extra.ppage;
+
         /* Check for continuation if it's not the last iteration. */
-        if ( xatp->size > ++done && hypercall_preempt_check() )
+        if ( (++done > ARRAY_SIZE(pages) && extra.ppage) ||
+             (xatp->size > done && hypercall_preempt_check()) )
         {
             rc = start + done;
             break;
@@ -858,6 +864,7 @@ int xenmem_add_to_physmap(struct domain
     if ( is_iommu_enabled(d) )
     {
         int ret;
+        unsigned int i;
 
         this_cpu(iommu_dont_flush_iotlb) = 0;
 
@@ -866,6 +873,15 @@ int xenmem_add_to_physmap(struct domain
         if ( unlikely(ret) && rc >= 0 )
             rc = ret;
 
+        /*
+         * Now that the IOMMU TLB flush was done for the original GFN, drop
+         * the page references. The 2nd flush below is fine to make later, as
+         * whoever removes the page again from its new GFN will have to do
+         * another flush anyway.
+         */
+        for ( i = 0; i < done; ++i )
+            put_page(pages[i]);
+
         ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), done,
                                 IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified);
         if ( unlikely(ret) && rc >= 0 )
@@ -879,6 +895,8 @@ static int xenmem_add_to_physmap_batch(s
                                        struct xen_add_to_physmap_batch *xatpb,
                                        unsigned int extent)
 {
+    union add_to_physmap_extra extra = {};
+
     if ( unlikely(xatpb->size < extent) )
         return -EILSEQ;
 
@@ -890,6 +908,19 @@ static int xenmem_add_to_physmap_batch(s
          !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) )
         return -EFAULT;
 
+    switch ( xatpb->space )
+    {
+    case XENMAPSPACE_dev_mmio:
+        /* res0 is reserved for future use. */
+        if ( xatpb->u.res0 )
+            return -EOPNOTSUPP;
+        break;
+
+    case XENMAPSPACE_gmfn_foreign:
+        extra.foreign_domid = xatpb->u.foreign_domid;
+        break;
+    }
+
     while ( xatpb->size > extent )
     {
         xen_ulong_t idx;
@@ -902,8 +933,7 @@ static int xenmem_add_to_physmap_batch(s
                                                extent, 1)) )
             return -EFAULT;
 
-        rc = xenmem_add_to_physmap_one(d, xatpb->space,
-                                       xatpb->u,
+        rc = xenmem_add_to_physmap_one(d, xatpb->space, extra,
                                        idx, _gfn(gpfn));
 
         if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) )
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -588,8 +588,22 @@ void scrub_one_page(struct page_info *);
                       &(d)->xenpage_list : &(d)->page_list)
 #endif
 
+union add_to_physmap_extra {
+    /*
+     * XENMAPSPACE_gmfn: When deferring TLB flushes, a page reference needs
+     * to be kept until after the flush, so the page can't get removed from
+     * the domain (and re-used for another purpose) beforehand. By passing
+     * non-NULL, the caller of xenmem_add_to_physmap_one() indicates it wants
+     * to have ownership of such a reference transferred in the success case.
+     */
+    struct page_info **ppage;
+
+    /* XENMAPSPACE_gmfn_foreign */
+    domid_t foreign_domid;
+};
+
 int xenmem_add_to_physmap_one(struct domain *d, unsigned int space,
-                              union xen_add_to_physmap_batch_extra extra,
+                              union add_to_physmap_extra extra,
                               unsigned long idx, gfn_t gfn);
 
 int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp,
